diff --git a/.circleci/config.yml b/.circleci/config.yml index 19c2d377a..0193fc253 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -18,45 +18,91 @@ setup_env: &setup_env - run: name: Setup environment command: | - python3.8 --version - python3.8 -m pip install --upgrade pip - cd python - python3.8 setup.py bdist_wheel - sudo python3.8 -m pip install --no-input dist/*.whl - cd .. - python3.8 -m pip install pytest - python3.8 -m pip install torch - python3.8 -m pip install numpy - python3.8 -m pip install jinja2 - python3.8 -m pip install recordtype - python3.8 -m pip install parameterized - python3.8 -m pip install einops - git submodule sync - git submodule update --init - echo 'export PYTHONPATH=$PWD/python:$PYTHONPATH' >> $BASH_ENV - echo 'export PATH=/usr/local/cuda-11.4/bin:$PATH' >> $BASH_ENV - echo 'export CI_FLAG=CIRCLECI' >> $BASH_ENV - echo 'export CACHE_DIR=$PWD/tests/ci_profile_cache' >> $BASH_ENV + for i in {1..3}; do + sudo update-alternatives --set cuda /usr/local/cuda-11.4 + echo 'export PATH=/usr/local/cuda/bin:$PATH' >> $BASH_ENV && + source "$BASH_ENV" + python3.8 --version && + python3.8 -m pip install --upgrade pip && + cd /home/circleci/project/python && + python3.8 setup.py bdist_wheel && + sudo python3.8 -m pip install --no-input dist/*.whl && + cd /home/circleci/project && + python3.8 -m pip install 'cuda-python<12.0.0' && + python3.8 -m pip install pytest && + python3.8 -m pip install torch && + python3.8 -m pip install numpy && + python3.8 -m pip install jinja2 && + python3.8 -m pip install sympy && + python3.8 -m pip install recordtype && + python3.8 -m pip install parameterized && + python3.8 -m pip install einops && + git submodule sync && + git submodule update --init && + echo 'export PYTHONPATH=$PWD/python:$PYTHONPATH' >> $BASH_ENV && + echo 'export CI_FLAG=CIRCLECI' >> $BASH_ENV && + echo 'export CACHE_DIR=$PWD/tests/ci_profile_cache' >> $BASH_ENV && + echo 'export LOGLEVEL=DEBUG' >> $BASH_ENV && + break || sleep 5; 
+ done + + +setup_fx2ait_env: &setup_fx2ait_env + - run: + name: Setup fx2ait environment + command: | + for i in {1..3}; do + wget https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz + tar -xvf cudnn-*-archive.tar.xz + sudo cp cudnn-*-archive/include/cudnn*.h /usr/local/cuda/include + sudo cp -P cudnn-*-archive/lib/libcudnn* /usr/local/cuda/lib64 + sudo chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* + python3.8 -m pip install --ignore-installed --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 + pushd fx2ait + python3.8 setup.py develop --user + popd + break || sleep 5; + done basic_tests: &basic_tests - run: name: Run tests + no_output_timeout: 20m command: | set -e TEST_FILES=$(circleci tests glob "tests/unittest/**/test_*.py" | grep -v benchmark | circleci tests split --split-by=timings) - mkdir test-results - python3.8 -m pytest $TEST_FILES --junitxml=test-results/junit.xml --verbose --continue-on-collection-errors -rA + mkdir ~/test-results + python3.8 -m pytest $TEST_FILES -o junit_family=xunit1 --junitxml=~/test-results/junit.xml --verbose --continue-on-collection-errors -rA +fx2ait_tests: &fx2ait_tests + - run: + name: Run fx2ait tests + command: | + source $BASH_ENV + mkdir -p ~/test-fx2ait-results + TEST_FILES=$(circleci tests glob "fx2ait/fx2ait/test/test_*.py" "fx2ait/fx2ait/test/converters/**/test_*.py") + python3.8 -m pytest $TEST_FILES -o junit_family=xunit1 --junitxml=~/test-fx2ait-results/junit.xml --verbose --continue-on-collection-errors -rA # Define a job to be invoked later in a workflow. 
# See: https://circleci.com/docs/2.0/configuration-reference/#jobs jobs: + fx2ait-test: + machine: + image: linux-cuda-11:default + resource_class: gpu.nvidia.medium + steps: + - checkout + - <<: *setup_env + - <<: *setup_fx2ait_env + - <<: *fx2ait_tests + - store_test_results: + path: ~/test-fx2ait-results + build-and-test: machine: - image: ubuntu-2004-cuda-11.4:202110-01 + image: linux-cuda-11:default # Check T101565170 for multi-gpu use cases. resource_class: gpu.nvidia.medium - parallelism: 10 # Checkout the code as the first step. This is a dedicated CircleCI step. @@ -69,7 +115,7 @@ jobs: - <<: *setup_env - <<: *basic_tests - store_test_results: - path: test-results + path: ~/test-results # Invoke jobs via workflows # See: https://circleci.com/docs/2.0/configuration-reference/#workflows @@ -77,4 +123,5 @@ workflows: unittest: # This is the name of the workflow, feel free to change it to better match your workflow. # Inside the workflow, you define the jobs you want to run. jobs: + - fx2ait-test - build-and-test diff --git a/.flake8 b/.flake8 index 71a5883ed..9ef66bc0d 100644 --- a/.flake8 +++ b/.flake8 @@ -7,111 +7,111 @@ ignore = # Found in https://github.com/psf/black/issues/429 # Line too long. B950, - # Indentation is not a multiple of four. - E111, + # Indentation is not a multiple of four. + E111, # Expected an indented block (comment). - E115, + E115, # Over-indented. E117, - # Continuation line under-indented for hanging indent. + # Continuation line under-indented for hanging indent. E121, - # Continuation line missing indentation or outdented. + # Continuation line missing indentation or outdented. E122, - # Closing bracket does not match indentation of opening bracket's line. + # Closing bracket does not match indentation of opening bracket's line. E123, - # Closing bracket does not match visual indentation. + # Closing bracket does not match visual indentation. E124, - # Continuation line with same indent as next logical line. 
+ # Continuation line with same indent as next logical line. E125, - # Continuation line over-indented for hanging indent. + # Continuation line over-indented for hanging indent. E126, - # Continuation line over-indented for visual indent. + # Continuation line over-indented for visual indent. E127, - # Continuation line under-indented for visual indent. + # Continuation line under-indented for visual indent. E128, - # Visually indented line with same indent as next logical line. + # Visually indented line with same indent as next logical line. E129, - # Continuation line unaligned for hanging indent. + # Continuation line unaligned for hanging indent. E131, - # Whitespace after '('. + # Whitespace after '('. E201, - # Whitespace before ')'. + # Whitespace before ')'. E202, - # Whitespace before ':'. + # Whitespace before ':'. E203, - # Multiple spaces before operator. + # Multiple spaces before operator. E221, - # Multiple spaces after operator. + # Multiple spaces after operator. E222, - # Missing whitespace around operator. + # Missing whitespace around operator. E225, - # Missing whitespace around arithmetic operator. + # Missing whitespace around arithmetic operator. E226, - # Missing whitespace around bitwise or shift operator. + # Missing whitespace around bitwise or shift operator. E227, - # Missing whitespace after ',', ';', or ':'. + # Missing whitespace after ',', ';', or ':'. E231, - # Multiple spaces after ','. + # Multiple spaces after ','. E241, - # Unexpected spaces around keyword / parameter equals. + # Unexpected spaces around keyword / parameter equals. E251, - # Missing whitespace around parameter equals. + # Missing whitespace around parameter equals. E252, - # At least two spaces before inline comment. - E261, + # At least two spaces before inline comment. + E261, # Inline comment should start with '# '. - E262, + E262, # Block comment should start with '# '. E265, - # Multiple spaces after keyword. + # Multiple spaces after keyword. 
E271, - # Multiple spaces before keyword. + # Multiple spaces before keyword. E272, - # Expected 1 blank line, found 0. + # Expected 1 blank line, found 0. E301, - # Expected 2 blank lines, found 0. + # Expected 2 blank lines, found 0. E302, - # Too many blank lines (3). + # Too many blank lines (3). E303, - # Expected 2 blank lines after end of function or class. + # Expected 2 blank lines after end of function or class. E305, - # Expected 1 blank line before a nested definition. + # Expected 1 blank line before a nested definition. E306, - # Line too long (82 > 79 characters). + # Line too long (82 > 79 characters). E501, - # The backslash is redundant between brackets. + # The backslash is redundant between brackets. E502, - # Multiple statements on one line (colon). + # Multiple statements on one line (colon). E701, - # Multiple statements on one line (semicolon). + # Multiple statements on one line (semicolon). E702, - # Statement ends with a semicolon. + # Statement ends with a semicolon. E703, - # Multiple statements on one line (def). + # Multiple statements on one line (def). E704, - # Trailing whitespace. + # Trailing whitespace. W291, - # No newline at end of file. + # No newline at end of file. W292, - # Blank line contains whitespace. + # Blank line contains whitespace. W293, - # Blank line at end of file. + # Blank line at end of file. W391, - # Line break occurred after a binary operator. - W504, + # Line break occurred after a binary operator. + W504, # Too opinionated. # Block comment should start with '# '. E265, - # Too many leading '#' for block comment. + # Too many leading '#' for block comment. E266, - # Module level import not at top of file. (Use cases like demandimport https://fburl.com/demandimport require statements before imports) - E402, + # Module level import not at top of file. (Use cases like demandimport https://fburl.com/demandimport require statements before imports) + E402, # Do not use bare except, specify exception instead. 
(Duplicate of B001) - E722, + E722, # (Duplicate of B003) - P207, + P207, # (Duplicate of C403) P208, # Line break occurred before a binary operator. - W503 + W503 diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml new file mode 100644 index 000000000..3ebf6640d --- /dev/null +++ b/.github/workflows/docs.yaml @@ -0,0 +1,34 @@ +name: Docs + +on: + push: + branches: + - main + + pull_request: + branches: + - main +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9"] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python3.9 -m pip install --upgrade pip + python3.9 -m pip install numpy autodocsumm 'sphinx<6' sphinx_rtd_theme sphinx_gallery sphinxcontrib-inlinesyntaxhighlight sphinx_toolbox + cd python + python setup.py develop + cd .. + - name: Build documents with Sphinx + run: | + cd docs + make html + cd .. 
diff --git a/.github/workflows/docs.yml b/.github/workflows/pages.yaml similarity index 80% rename from .github/workflows/docs.yml rename to .github/workflows/pages.yaml index 208bd1f77..815cfd887 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/pages.yaml @@ -1,5 +1,5 @@ # Simple workflow for deploying static content to GitHub Pages -name: Documentation +name: Deploy docs to Pages on: # Runs on pushes targeting the default branch @@ -39,15 +39,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install autodocsumm - pip install sphinx_rtd_theme - pip install sphinx_gallery - pip install sphinxcontrib-inlinesyntaxhighlight - pip install sphinx_toolbox - pip install numpy - pip install jinja2 - pip install torch + python3.9 -m pip install --upgrade pip + python3.9 -m pip install numpy autodocsumm 'sphinx<6' sphinx_rtd_theme sphinx_gallery sphinxcontrib-inlinesyntaxhighlight sphinx_toolbox jinja2 torch cd python python setup.py develop cd .. 
diff --git a/.github/workflows/lint.yml b/.github/workflows/pylint.yaml similarity index 85% rename from .github/workflows/lint.yml rename to .github/workflows/pylint.yaml index dbd4beb83..91f0018eb 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/pylint.yaml @@ -23,9 +23,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ufmt - pip install click - pip install flake8 + pip install ufmt==2.0.1 click==8.1.3 black==22.12.0 flake8==5.0.4 - name: Analyzing the code with flake8 run: | echo "::add-matcher::tests/lint/flake8_problem_matcher.json" @@ -38,4 +36,5 @@ jobs: - name: Check Meta copyright header run: | python tests/lint/check_meta_header.py --path=./tests --fixit=False - python tests/lint/check_meta_header.py --path=./python --fixit=False \ No newline at end of file + python tests/lint/check_meta_header.py --path=./python --fixit=False + python tests/lint/check_meta_header.py --path=./fx2ait --fixit=False diff --git a/.github/workflows/ait_ci.yml b/.github/workflows/rocm_ci.yml similarity index 93% rename from .github/workflows/ait_ci.yml rename to .github/workflows/rocm_ci.yml index 0f598865f..61c93d643 100644 --- a/.github/workflows/ait_ci.yml +++ b/.github/workflows/rocm_ci.yml @@ -1,10 +1,12 @@ -name: AITemplate_ci +name: ROCM_CI -on: - push: +on: + pull_request: + types: [labeled, synchronize, reopened] jobs: build: + if: contains(github.event.label.name, 'rocm') runs-on: rocm steps: @@ -33,7 +35,7 @@ jobs: rocm-smi rocminfo | grep "gfx" export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}} - git clone --recursive -b $GIT_BRANCH https://github.com/ROCmSoftwarePlatform/AITemplate.git + git clone --recursive -b $GIT_BRANCH https://github.com/facebookincubator/AITemplate.git cd AITemplate DOCKER_BUILDKIT=1 ./docker/build.sh rocm docker run --network=host --device=/dev/kfd --device=/dev/dri --ipc=host --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $HOME:/dockerx/ 
ait:latest @@ -124,8 +126,9 @@ jobs: git show --summary | grep commit >> sdiff.log /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> sdiff.log # profiling - HIP_VISIBLE_DEVICES=0,1 python3 compile.py --token ${{ secrets.HF_TOKEN }} 2>&1 | tee -a sdiff.log - HIP_VISIBLE_DEVICES=0 python3 demo.py --token ${{ secrets.HF_TOKEN }} --benchmark 1 2>&1 | tee -a sdiff.log + python3 scripts/download_pipeline.py --token ${{ secrets.HF_TOKEN }} 2>&1 | tee -a sdiff.log + HIP_VISIBLE_DEVICES=0,1 python3 scripts/compile.py 2>&1 | tee -a sdiff.log + HIP_VISIBLE_DEVICES=0 python3 scripts/demo.py --benchmark 1 2>&1 | tee -a sdiff.log - name: Archive logs uses: actions/upload-artifact@v3 with: @@ -140,4 +143,3 @@ jobs: export dbuser=${{ secrets.DBUSER }} export dbpassword=${{ secrets.DBPASSWORD }} python3 process_results.py - diff --git a/.gitignore b/.gitignore index f3bbc0889..8897298b9 100644 --- a/.gitignore +++ b/.gitignore @@ -136,6 +136,9 @@ tags # macOS dir files .DS_Store +# PyCharm files +.idea + # vscode .vscode diff --git a/.gitmodules b/.gitmodules index a82a39064..1272127de 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "3rdparty/cutlass"] path = 3rdparty/cutlass - url = https://github.com/AITemplate/cutlass.git + url = https://github.com/facebookincubator/cutlass-fork.git [submodule "3rdparty/cub"] path = 3rdparty/cub url = https://github.com/NVIDIA/cub.git @@ -8,3 +8,6 @@ path = 3rdparty/composable_kernel url = https://github.com/ROCmSoftwarePlatform/composable_kernel.git branch = develop +[submodule "3rdparty/picojson"] + path = 3rdparty/picojson + url = https://github.com/kazuho/picojson.git diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel index 52abc2f37..78eb3f0b4 160000 --- a/3rdparty/composable_kernel +++ b/3rdparty/composable_kernel @@ -1 +1 @@ -Subproject commit 52abc2f37112d49f85f31aa343a14bd92a83b07c +Subproject commit 78eb3f0b46aafc52c6d19a07b9dc5bd19b8e7807 diff --git a/3rdparty/cutlass 
b/3rdparty/cutlass index f434be22a..a9d9b8049 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit f434be22a6270f9f000712286f92545ccca045b7 +Subproject commit a9d9b80493e20086732f51f90f10f99ae50ae5ed diff --git a/3rdparty/picojson b/3rdparty/picojson new file mode 160000 index 000000000..111c9be51 --- /dev/null +++ b/3rdparty/picojson @@ -0,0 +1 @@ +Subproject commit 111c9be5188f7350c2eac9ddaedd8cca3d7bf394 diff --git a/README.md b/README.md index 38330592d..907e9d3bc 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ # AITemplate [![License](https://img.shields.io/badge/License-Apache_2.0-brightgreen.svg)](https://github.com/facebookincubator/AITemplate/blob/main/LICENSE) | -[![Documentation](https://github.com/facebookincubator/AITemplate/actions/workflows/docs.yml/badge.svg)](https://facebookincubator.github.io/AITemplate) | +[![Documentation](https://github.com/facebookincubator/AITemplate/actions/workflows/docs.yaml/badge.svg)](https://facebookincubator.github.io/AITemplate) | [![CircleCI](https://circleci.com/gh/facebookincubator/AITemplate.svg?style=svg)](https://app.circleci.com/pipelines/github/facebookincubator/AITemplate) - - +[![Deploy docs to Pages](https://github.com/facebookincubator/AITemplate/actions/workflows/pages.yaml/badge.svg)](https://github.com/facebookincubator/AITemplate/actions/workflows/pages.yaml) AITemplate (AIT) is a Python framework that transforms deep neural networks into CUDA (NVIDIA GPU) / HIP (AMD GPU) C++ code for lightning-fast inference serving. AITemplate highlights include: - High performance: close to roofline fp16 TensorCore (NVIDIA GPU) / MatrixCore (AMD GPU) performance on major models, including ResNet, MaskRCNN, BERT, VisionTransformer, Stable Diffusion, etc. -- Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easy extendable high-performance primitives for new model support. 
Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms. +- Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easily extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms. + ## More about AITemplate @@ -24,32 +24,48 @@ AITemplate provides unique advanced horizontal fusion. AITemplate can fuse paral ### Vertical Fusion -AITemplate provides strong vertical fusion. AITemplate can fuse a large range of operations into TensorCore/MatrixCore operations, such as elementwise operations, reduction operations, and layout permutation operations. AITemplate also provides back-to-back style TensorCore / MatrixCore operation fusion. +AITemplate provides strong vertical fusion. AITemplate can fuse a large range of operations into TensorCore/MatrixCore operations, such as elementwise operations, reductions, and layout permutations. AITemplate also provides back-to-back style TensorCore / MatrixCore operation fusion. ### Memory Fusion AITemplate provides innovative memory fusions. AITemplate can fuse GEMM, LayerNorm, and other operators, followed by memory operations such as concatenation, split, and slice into a single operator. ### Working w/wo PyTorch + The AITemplate-generated Python runtime can take PyTorch tensors as inputs and outputs without an extra copy. For environments without PyTorch, the AITemplate Python/C++ runtime is self-contained. ### Extensions without suffering AITemplate provides a straightforward approach for making an extension in codegen. To add a new operator or a new fused kernel into AITemplate, most of the time one only needs to add two Python files: one for a graph node definition and another for the backend codegen. The CUDA/HIP kernel in a text header file can be directly utilized in the codegen. 
+ +## FX2AIT + +FX2AIT is a Python-based tool that converts PyTorch models into AITemplate (AIT) engine for lightning-fast inference serving. Using FX2AIT's built-in AITLowerer, partial AIT acceleration can be achieved for models with unsupported operators in AITemplate. + +Key features of FX2AIT include: + +* Easy Conversion: FX2AIT requires only a PyTorch model and input for conversion, generating an "AITModule" output for inference serving. +* Expanded Support: AITemplate does not support all PyTorch operators. FX2AIT's AITLowerer offers a solution for partial AIT conversion for models with unsupported operators. Check the `fx2ait/fx2ait/example/03_lowering_split` for more information. + +More info can be found from https://github.com/facebookincubator/AITemplate/tree/main/fx2ait. + + ## Installation -**Hardware requirement:** +**Hardware requirements:** - **NVIDIA**: AIT is only tested on SM80+ GPUs (Ampere etc). Not all kernels work with old SM75/SM70 (T4/V100) GPUs. - **AMD**: AIT is only tested on CDNA2 (MI-210/250) GPUs. There may be compiler issues for old CDNA1 (MI-100) GPUs. -## Clone the code +### Clone the code + When cloning the code, please use the following command to also clone the submodules: ``` git clone --recursive https://github.com/facebookincubator/AITemplate ``` ### Docker Image + We highly recommend using AITemplate with Docker to avoid accidentally using a wrong version of NVCC or HIPCC. - CUDA: `./docker/build.sh cuda` - ROCM: `DOCKER_BUILDKIT=1 ./docker/build.sh rocm` @@ -57,6 +73,7 @@ We highly recommend using AITemplate with Docker to avoid accidentally using a w This will build a docker image with tag `ait:latest`. ### From Source + The following command will create a Python wheel for AITemplate. Please ensure you have correct CUDA/ROCm compiler installed. 
- CUDA: CUDA 11.6 - ROCm: We tested on ROCm 5.2.3 with a customized build HIPCC with the command in docker/Dockerfile.rocm#L87-L96 @@ -83,45 +100,48 @@ There are a few tutorials for onboarding: ## Examples & Performance -AITemplate provides the following model templates & reference performance data on A100/MI-250 + +AITemplate provides the following model templates & reference performance data on A100/MI-250: - [01_ResNet-50](examples/01_resnet-50/) with PyTorch Image Models (TIMM) - [02_MaskRCNN-FPN](examples/02_detectron2/) with Detectron2 -- [03_BERT](examples/03_bert/) with HuggingFace Transformer +- [03_BERT](examples/03_bert/) with Hugging Face Transformer - [04_Vision Transformer](examples/04_vit/) with PyTorch Image Models (TIMM) -- [05_Stable Diffusion](examples/05_stable_diffusion/) with HuggingFace Diffusers +- [05_Stable Diffusion](examples/05_stable_diffusion/) with Hugging Face Diffusers ## Release -AITemplate has a 90 days release cycle. -In the next one or two releases, we will focus on: -- Deprecating FlashAttention: Unify CUDA Attention computation to Composable Kernel (AMD GPU) style back-to-back fusion to improve performance and increase flexibility for NVIDIA GPU Transformer users. -- Remove kernel profiling requirement. -- GEMM + LayerNorm fusion, GEMM + GEMM fusion, Conv + Conv fusion. -- Better dynamic shape support: Focus on the dynamic sequence in Transformers. -- More model templates: Provide model templates with control flow and containers. +All current development updates can be seen in the AITemplate repository. Releases are not on a set schedule and will only be tagged for significant feature releases. + +Mid-term plan: +- Better dynamic shape support: Focus on the dynamic sequence in Transformers. Add symbolic shape support. - More automatic graph passes: Relief manual rewrite models to obtain the best performance. -- Enable more fusions on AMD backend. +- Quantization: fp8/int8/int4. +- Sparsity pruning for Gemm. 
+- PT2 integration: Aten2AIT is under active development. -Some ongoing/potential work that won't appear in the next short-term release: -- Automatic Pytorch-FX, ONNX, Open-XLA and other format model conversion. -- Quantized model (int8/fp8/int4) support. +Long-term plan: +- Automatic ONNX, Open-XLA and other format model conversion. - Composable Kernel CPU extension on AVX2/AVX-512 for AMD Epyc CPU. ## Contributing + Check our [contributing guide](CONTRIBUTING.md) to learn about how to contribute to the project. ## The Team -AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions coming from more talented engineers. A non-exhaustive list to mention is Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, Max Podkorytov. We also want to thank the discussions with Andrew Tulloch, Yinghai Lu, Lu Fang. +AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://github.com/ipiszy), [Yang Chen](https://github.com/chenyang78), [Terry Chen](https://github.com/terrychenism), [Mu-Chu Lee](https://github.com/muchulee8), [Max Podkorytov](https://github.com/tenpercent), [Adnan Akhundov](https://github.com/aakhundov). -AITemplate is currently maintained by Meta engineers: [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), [Terry Chen](https://github.com/terrychenism), [Mike Iovine](https://github.com/mikeiovine), [Mu-Chu Lee](https://github.com/muchulee8) and [Bing Xu](https://github.com/antinucleon). 
+AITemplate is co-created by Meta engineers: [Bing Xu](https://github.com/antinucleon), [Ying Zhang](https://github.com/ipiszy), [Hao Lu](https://github.com/hlu1), [Yang Chen](https://github.com/chenyang78), and [Terry Chen](https://github.com/terrychenism), with major contributions coming from more talented engineers. A non-exhaustive list to mention is Mike Iovine, Mu-Chu Lee, Scott Wolchok, Oleg Khabinov, Shirong Wu, Huaming Li, Hui Guo, Zhijing Li, Max Podkorytov. We also want to thank Andrew Tulloch, Yinghai Lu, Lu Fang for the valuable discussions. +FX2AIT and Aten2AIT are co-created and maintained by Meta engineers: [Wei Wei](https://github.com/frank-wei), [Shirong Wu](https://github.com/wushirong) and [Zhijing Li](https://github.com/tissue3). -## Acknowledgement -AITemplate team works deeply with NVIDIA [CUTLASS](https://github.com/NVIDIA/cutlass) Team (Led by Andrew Kerr, Haicheng Wu) and AMD [Composable Kernel](https://github.com/ROCmSoftwarePlatform/composable_kernel) Team (Led by Chao Liu, Jing Zhang). We co-designed many advanced GPU optimizations specialized for each platform, and nothing is possible without our close collaboration. +## Acknowledgements + +AITemplate team works deeply with NVIDIA [CUTLASS](https://github.com/NVIDIA/cutlass) Team (led by Andrew Kerr, Haicheng Wu) and AMD [Composable Kernel](https://github.com/ROCmSoftwarePlatform/composable_kernel) Team (led by Chao Liu, Jing Zhang). We co-designed many advanced GPU optimizations specialized for each platform, and nothing is possible without our close collaboration. ## License + AITemplate is licensed under the [Apache 2.0 License](https://github.com/facebookincubator/AITemplate/blob/main/LICENSE). diff --git a/default.nix b/default.nix new file mode 100644 index 000000000..d521651e9 --- /dev/null +++ b/default.nix @@ -0,0 +1,50 @@ +{ pkgs ? 
import <nixpkgs> {
# # ROCM Docker Image for AITemplate -FROM ubuntu:20.04 - -ARG ROCMVERSION=5.3 - -RUN set -xe - -ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ -# Add rocm repository -RUN apt-get update -RUN apt-get install -y wget gnupg -RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - -RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" -RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - -RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list" - -# Install dependencies -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ - apt-utils \ - build-essential \ - cmake-data \ - cmake \ - curl \ - git \ - hip-rocclr \ - jq \ - libelf-dev \ - libncurses5-dev \ - libnuma-dev \ - libpthread-stubs0-dev \ - llvm-amdgpu \ - pkg-config \ - python \ - python3 \ - python-dev \ - python3-dev \ - python3-pip \ - software-properties-common \ - rocm-dev \ - rocm-device-libs \ - rocm-cmake \ - rocm-libs \ - vim \ - zlib1g-dev \ - openssh-server \ - clang-format-10 \ - kmod && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Setup ubsan environment to printstacktrace -RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer -ENV UBSAN_OPTIONS=print_stacktrace=1 - -# Install an init system -RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb -RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb - -ARG PREFIX=/opt/rocm +FROM docker.io/rocm/pytorch:rocm6.0_ubuntu22.04_py3.9_pytorch_2.0.1 # Setup ubsan environment to printstacktrace ENV UBSAN_OPTIONS=print_stacktrace=1 ENV LC_ALL=C.UTF-8 ENV LANG=C.UTF-8 -ADD ./docker/install/rocm_dev-requirements.txt dev-requirements.txt RUN groupadd -f render -# Install the new rocm-cmake version -RUN git clone -b master 
https://github.com/RadeonOpenCompute/rocm-cmake.git && \ - cd rocm-cmake && mkdir build && cd build && \ - cmake .. && cmake --build . && cmake --build . --target install - WORKDIR / -ADD ./docker/install/ /Install +RUN git clone -b merge_upstream --recursive https://github.com/ROCmSoftwarePlatform/AITemplate.git + +WORKDIR /AITemplate # necessary package -RUN bash /Install/install_basic_dep.sh +RUN bash ./docker/install/install_basic_dep.sh # for test -RUN bash /Install/install_test_dep.sh +RUN bash ./docker/install/install_test_dep.sh # for docs -RUN bash /Install/install_doc_dep.sh - -# Install Pytorch -RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 +RUN bash ./docker/install/install_doc_dep.sh # for detection RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata -RUN bash /Install/install_detection_deps.sh +RUN bash ./docker/install/install_detection_deps.sh -# Copy AITemplate to Docker -RUN mkdir /AITemplate -ADD ./COMMIT_INFO /AITemplate/COMMIT_INFO -ADD ./python /AITemplate/python -ADD ./3rdparty /AITemplate/3rdparty -ADD ./examples /AITemplate/examples -ADD ./tests /AITemplate/tests -ADD ./docs /AITemplate/docs -ADD ./static /AITemplate/static -ADD ./licenses /AITemplate/licenses -ADD ./docker/install/install_ait.sh /AITemplate/ -RUN bash /AITemplate/install_ait.sh +RUN bash ./docker/install/install_ait.sh diff --git a/docker/install/install_basic_dep.sh b/docker/install/install_basic_dep.sh index 801ef53ef..18f37f628 100644 --- a/docker/install/install_basic_dep.sh +++ b/docker/install/install_basic_dep.sh @@ -1,4 +1,5 @@ #!/bin/bash +apt install -y time pip3 install numpy pip3 install jinja2 diff --git a/docker/install/install_detection_deps.sh b/docker/install/install_detection_deps.sh index 47238cd3c..e8b91f9d5 100644 --- a/docker/install/install_detection_deps.sh +++ b/docker/install/install_detection_deps.sh @@ -5,5 +5,5 @@ pip3 install yacs pip3 install opencv-python pip3 
install tqdm pip3 install timm -pip3 install transformers -pip3 install diffusers +pip3 install transformers==4.25.0 +pip3 install diffusers==0.24.0 \ No newline at end of file diff --git a/docs/image/gpu_grid_block.png b/docs/image/gpu_grid_block.png new file mode 100644 index 000000000..a486a5bf9 Binary files /dev/null and b/docs/image/gpu_grid_block.png differ diff --git a/docs/image/pack_size_1.png b/docs/image/pack_size_1.png new file mode 100644 index 000000000..b07bb5ff4 Binary files /dev/null and b/docs/image/pack_size_1.png differ diff --git a/docs/image/pack_size_2.png b/docs/image/pack_size_2.png new file mode 100644 index 000000000..6769b0691 Binary files /dev/null and b/docs/image/pack_size_2.png differ diff --git a/docs/image/pack_size_4.png b/docs/image/pack_size_4.png new file mode 100644 index 000000000..40deaa502 Binary files /dev/null and b/docs/image/pack_size_4.png differ diff --git a/docs/image/pack_size_8.png b/docs/image/pack_size_8.png new file mode 100644 index 000000000..804187fd9 Binary files /dev/null and b/docs/image/pack_size_8.png differ diff --git a/docs/image/softmax.png b/docs/image/softmax.png new file mode 100644 index 000000000..e2fc3c523 Binary files /dev/null and b/docs/image/softmax.png differ diff --git a/docs/image/vs_oneflow.png b/docs/image/vs_oneflow.png new file mode 100644 index 000000000..495bdd426 Binary files /dev/null and b/docs/image/vs_oneflow.png differ diff --git a/docs/source/arch/philosophy.rst b/docs/source/arch/philosophy.rst index 2eefb8f5d..d1ac35db4 100644 --- a/docs/source/arch/philosophy.rst +++ b/docs/source/arch/philosophy.rst @@ -5,12 +5,17 @@ Design Philosophy KISS (Keep it simple and stupid) -------------------------------- -AITemplate avoids deep IR lowering stacks to reduce the system's complexity. A highly modularized, multiple backend codegen system written in pure Python directly attacks the pain point in high-performance GPU inference. 
+AITemplate avoids deep IR lowering stacks to reduce the system's complexity. +A highly modularized, multiple backend codegen system written in pure Python directly attacks the pain point in high-performance GPU inference. Pragmatism ---------- -AITemplate provides a PyTorch-style frontend to enable engineers to manually match the PyTorch model & weights to AITemplate for optimization. Using it is less painful than debugging different lowering IR stacks, especially for complex models such as MaskRCNN. +AITemplate provides a PyTorch-style frontend to enable engineers to manually match the PyTorch model & weights to AITemplate for optimization. +Using it is less painful than debugging different lowering IR stacks, especially for complex models such as MaskRCNN. - -We believe most of the neural network workload can be decoupled. For example, most of the network can be decoupled into Encoder, Decoder, and Decoder logics. For encoder and decoder, it is a computation bounded problem. For decoder logic, it may involve more control flows. By using divide and conquer, we left the decoder logic part to C++ or Python rather than build a unified language / IR stack to play as the silver bullet. \ No newline at end of file +We believe most of the neural network workload can be decoupled. +For example, most of the network can be decoupled into Encoder, Decoder, and Decoder logics. +For encoder and decoder, it is a computation-bounded problem. +For decoder logic, it may involve more control flows. +By using divide and conquer, we left the decoder logic part to C++ or Python rather than build a unified language / IR stack as a silver bullet. 
diff --git a/docs/source/conf.py b/docs/source/conf.py index bf239d5d1..51fbf50db 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = "AITemplate" -copyright = "2022, Meta Platforms" +copyright = "2022-2023, Meta Platforms" author = "Meta Platforms" # The full version, including alpha/beta/rc tags -release = "0.1" +release = "0.2" # -- General configuration --------------------------------------------------- diff --git a/docs/source/debughints.rst b/docs/source/debughints.rst index 074254a75..0bd07d3c1 100644 --- a/docs/source/debughints.rst +++ b/docs/source/debughints.rst @@ -1,14 +1,15 @@ Debug Hints =========== -AITemplate is a new project under active development. We have a rich test set to avoid bugs but don't be surprised if there is anything unexpected. +AITemplate is a new project under active development. +We have a rich test set to avoid bugs but don't be surprised if there is anything unexpected. -Here are some helpful tips when we learned during the development AITemplate: +Here are some helpful tips we learned during the development of AITemplate: -1. Once the codegen for op which requires profiling is changed, remember to delete old profilers (usually located at workdir), and flush the cache by either deleting ~/.aitemplate or setting environment variable FLUSH_PROFILE_CACHE=1 +1. Once the codegen for op which requires profiling is changed, remember to delete old profilers (usually located at workdir), and flush the cache by either deleting `~/.aitemplate` or setting the environment variable `FLUSH_PROFILE_CACHE=1`. -2. Check the pseudo code/visualization generated by each optimization pass if some optimization is harmful. +2. Check the pseudo code/visualization generated by each optimization pass if some optimization behaves in unexpected way. 3. 
Always do the numerical test, from small to large, to make sure the entire model is correct. -4. Try to make the new fusion subgraph work in a manual way, then try to add an automatic pass to rewrite the graph with the fused subgraph. \ No newline at end of file +4. Try to make the new fusion subgraph work in a manual way, then try to add an automatic pass to rewrite the graph with the fused subgraph. diff --git a/docs/source/index.rst b/docs/source/index.rst index 775d33792..9dbcdcc9a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,11 +1,11 @@ AITemplate Documentation -====================================== +======================== AITemplate (AIT) is a Python framework that transforms deep neural networks into CUDA (NVIDIA GPU) / HIP (AMD GPU) C++ code for lightning-fast inference serving. AITemplate highlights include: * High performance: close to roofline fp16 TensorCore (NVIDIA GPU) / MatrixCore (AMD GPU) performance on major models, including ResNet, MaskRCNN, BERT, VisionTransformer, Stable Diffusion, etc. -* Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easy extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms. +* Unified, open, and flexible. Seamless fp16 deep neural network models for NVIDIA GPU or AMD GPU. Fully open source, Lego-style easily extendable high-performance primitives for new model support. Supports a significantly more comprehensive range of fusions than existing solutions for both GPU platforms. .. toctree:: diff --git a/docs/source/install/index.rst b/docs/source/install/index.rst index 48244cfa7..6e684ab03 100644 --- a/docs/source/install/index.rst +++ b/docs/source/install/index.rst @@ -7,11 +7,11 @@ Using Docker The easiest way to get started is to use Docker. 
Using docker is able to avoid performance regression caused by incorrect version of NVCC and HIPCC. To use docker, we provide a bash script to build the docker image. -- CUDA: +- CUDA: .. code-block:: bash ./docker/build.sh cuda -- ROCM: +- ROCM: .. code-block:: bash DOCKER_BUILDKIT=1 ./docker/build.sh rocm @@ -31,13 +31,13 @@ To launch the docker container docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add=video --ipc=host --cap-add=SYS_PTRACE --security-opt seccomp=unconfined ait:latest -AITemplate will be installed in as a Python package to Python 3.8. There will be also a copy of source code and examples at `/AITemplate` +AITemplate will be installed as a Python package in Python 3.8. There will be also a copy of the source code and examples at `/AITemplate`. -Install as standard Python package ----------------------------------- +Installing as a Standard Python Package +--------------------------------------- -Before start installing AITemplate, first make sure you have correct hardware and software environment. +Before installing AITemplate, first make sure you have correct hardware and software environment. - Hardware - NVIDIA: AIT is only tested on SM80+ GPUs (Ampere etc). @@ -52,24 +52,23 @@ Before start installing AITemplate, first make sure you have correct hardware an - AMD: ROCm 5.2, with HIPCC 10736 (commit `b0f4678b9058a4ae00200dfb1de0da5f2ea84dcb`) .. warning:: - - Incorrect compiler version will lead performance regression. - - Instruction for building HIPCC 10736 can be founded in `docker/Dockerfile.rocm` + - Incorrect compiler version may lead to performance regression. + - Instruction for building HIPCC 10736 can be founded in `docker/Dockerfile.rocm`. -When clone the code, please use the following command to clone the submodules: -``` -git clone --recursive https://github.com/facebookincubator/AITemplate -``` +When cloning the code, please use the following command to clone the submodules: + + .. 
code-block:: bash + + git clone --recursive https://github.com/facebookincubator/AITemplate .. warning:: - Please check all submodules are cloned correctly before go to next step. + Please check that all submodules are cloned correctly before the next step. -Then build Python wheel package and install. +Then build the Python wheel package and install it: .. code-block:: bash cd python python setup.py bdist_wheel - pip install dist/aitemplate-0.0.1-py3-none-any.whl - - + pip install dist/aitemplate-*.whl diff --git a/docs/source/reference/env.rst b/docs/source/reference/env.rst index 1342becf6..392999b33 100644 --- a/docs/source/reference/env.rst +++ b/docs/source/reference/env.rst @@ -1,13 +1,40 @@ Environment Variables ===================== -AITemplate uses environment variables to control the behavior of codegen and profiling. All the environment variables used in AITemplate are listed here. +AITemplate uses environment variables to control the behavior of codegen and profiling. +The environment variables used in AITemplate are listed here. Codegen ------- **NUM_BUILDERS**: The number of CPU jobs running in parallel during codegen. It controls both the profiler codegen and the final .so codegen. It's set to 12 in NIGHTLY jobs. Internally, it's set to 12 for normal tests and 24 for heavy tests. By default, the builder uses all the available CPUs for building. -**RECOMPILE**: If set to "0", it skips compilation for the .so and reuses the previously compiled ones. It is used to speed up local testing. The default value is "1" to always recompile. +**AIT_RECOMPILE**: If set to "0", it skips compilation for the .so and reuses the previously compiled ones. It is used to speed up local testing. The default value is "1" to always recompile. + +**AIT_NDEBUG**: If set to "1", compile with `NDEBUG`, disabling debug assertions. Recommended for production builds. "1" by default. 
+ +**AIT_COMPILER_OPT**: The optimization level for a compiler, which is directly passed to the host compiler command line. AITemplate host code may be very light in certain cases, so there is nothing to optimize for a host compiler. Thus, there is no need to make host compiler perform time costly optimizations. It may be very useful to use "-O0" value for debugging GPU kernels. "-O3" by default. + +**AIT_TIME_COMPILATION**: If set to "1", time each make command at the compilation time. This helps us to do compilation time analysis. Requires to install `time `_ package. + +**AIT_MULTISTREAM_MODE**: Controls multi-stream mode. Default mode is "0". +* If set to "0", then no multistreaming is used. +* If set to "1", then a simple multistreaming is used (iteratively track a wavefront of independent operators and execute ones). + +**AIT_MULTISTREAM_EXTRA_STREAMS**: Specifies the number of additional streams used. Default value is "4". + +**AIT_MULTISTREAM_MAX_MEM_PARALLEL_OPS**: Maximum number of parallel operators used in memory planning for simple multi-stream mode. Default value is "99999999" (basically, unlimited). + +**AIT_USE_CMAKE_COMPILATION**: (An experimental feature) If set to "1", then `cmake` will used instead of `make`. This allows to build AITemplate using MSVC Compiler + MSBuild on Windows, and it works for linux as well. This builder does not support many features (such as caching) yet. But it allows to generate a cmake project that can be loaded to a modern IDE. Default value is "0". + +**AIT_ENABLE_STANDALONE**: Enable standalone test and benchmark executable generation. Default value is "0" (disabled). If set to "1", this will generate a "test" executable that may be used to run standalone tests and benchmarks. This standalone executable is also well suited for running through debuggers and/or profiling tools, as it does not pull in python and pytorch as dependencies, unlike most python unit tests. 
+ +**AIT_ENABLE_PTXAS_INFO**: Set this to "1" to enable the generation and logging of verbose ( tuning-relevant ) information about CUDA ptx assembly code produced by the CUDA compiler nvcc. Intermediate ptx files, annotated with C++ source info will be written to the build directory. In addition, this flag enables warnings about CUDA register spilling and resource usage. + +**AIT_CUDA_DEBUG_LEVEL**: Configure level of CUDA debug information. Defaults to no debug info. This may either be a string with options passed to nvcc ( for example "-g -G" or "-lineinfo" ) or a CUDA debug level from "0" (default, no debug info), "1" ( "-lineinfo" ) include source code line information. Ideal for profiling with ncu/nsight-compute, "2" full debug information (**warning**: this disables all optimizations, regardless of other settings) + +**AIT_ENABLE_CUDA_SOURCE_NAVIGATION_FIX**: (Only supported by FBCUDA target so far): When this flag is enabled by setting it to "1" (it is disabled by default), every *.cu file in build dirs into a corresponding *.cu.h file and create a *.cu file which just includes this file. This fixes code navigation issues in some IDE's which don't treat .cu files as C++ files and disable code navigation. + +**AIT_ENABLE_INCLUDE_FROM_SOURCETREE**: (Only supported by FBCUDA target so far) When this flag is enabled by setting it to "1" (it is disabled by default), the target will create an in-place build which tries to directly reference the include paths within the AITemplate source tree. This helps to iterate faster during native Kernel/Operator development and debugging. Profiling --------- @@ -22,16 +49,24 @@ Profiling **HIP_VISIBLE_DEVICES**: This one is from ROCm itself. It's used to set the number of GPU devices available for profiling. Set to "0,1,2,3,4,5,6,7" to speed up profiling. For benchmarking, it's useful to set to a particular device to lower noise. 
**FORCE_PROFILE**: If set to "1", it will do profiling regardless of in_ci_env and disable_profiler_codegen. For non-NIGHTLY CI, we do not do profiling, and we could use FORCE_PROFILE=1 in these CI to do runs with codegen, compile, and profile.
+ +**AIT_PLOT_SHORTEN_TENSOR_NAMES**: If set to "1", shorten too long tensor names for a plot of a model graph, thus making a plot much easier to analyze visually. "0" by default. + +**AIT_USE_FAST_MATH**: If set to "0", no fast math option will be used for the device code generation. Default value is "1". diff --git a/docs/source/runtime/cxx_design.rst b/docs/source/runtime/cxx_design.rst index 5ef18f889..d4608409f 100644 --- a/docs/source/runtime/cxx_design.rst +++ b/docs/source/runtime/cxx_design.rst @@ -1,29 +1,30 @@ -================== +================ C++ Runtime Note -================== +================ `Model` v.s. `ModelContainer` -============================== +============================= -These are the two main classes involved in the C++ runtime implementation. +These are the two main classes involved in the C++ runtime implementation: -* The bulk of the runtime implementation is in `Model`. -* `ModelContainer` stores a set of shared constants and a collection of `Model`s. Almost all functions in `model_interface.h` forward to a method on `ModelContainer`. When `Run` is invoked, `ModelContainer` looks for an available `Model`, or blocks until one is available (see the section on asynchronous predictions). It then forwards the run request to the runtime. +* The bulk of the runtime implementation is in the `Model` class. +* The `ModelContainer` class stores a set of shared constants and a collection of `Model` instances. Almost all functions in `model_interface.h` forward to a method in `ModelContainer`. When `Run` is invoked, `ModelContainer` looks for an available `Model`, or blocks until one becomes available (see the section on asynchronous predictions). It then forwards the run request to the runtime. Code Structure ============== Some important files: -1. `include/model_interface.h`: The interface that we expose in the compiled .so +1. `include/model_interface.h`: The interface that we expose in the compiled `.so`. 2. 
`include/model_container.h`: The bulk of the `ModelContainer` implementation. Some files are generated at compile time. These include: -* `model-generated.h`: The implementation for `Model`. -* `model_container_base.cu`: A small part of the implementation for `ModelContainer` needs to be codegened. So `ModelContainer` inherits from `ModelContainerBase`, and `ModelContainerBase`'s implementation lives in this file. See `model_container.h` for more details. +* `model-generated.h`: The implementation of the `Model`. +* `model_container_base.cu`: A small part of the implementation for `ModelContainer` that needs to be generated. `ModelContainer` inherits from `ModelContainerBase`, and `ModelContainerBase`'s implementation lives in this file. See `model_container.h` for more details. -All codegen templates can be found in `backend/main_templates.py`. The codegen implementation is in `backend/codegen.py`. - -Note that many of the headers in this directory rely on generated code and thus cannot be `#include`d in external projects. The exception is `model_interface.h`. +All codegen templates can be found in `backend/main_templates.py`. +The codegen implementation is in `backend/codegen.py`. +Note that many of the headers in this directory rely on generated code and thus cannot be `#include` -d in external projects. +`model_interface.h` is an exception. diff --git a/docs/source/runtime/py_design.rst b/docs/source/runtime/py_design.rst index c143123de..55093b8df 100644 --- a/docs/source/runtime/py_design.rst +++ b/docs/source/runtime/py_design.rst @@ -1,6 +1,6 @@ -===================== +=================== Python Runtime Note -===================== +=================== Python `Model` ============== @@ -16,7 +16,7 @@ This class represents a contiguous blob of memory that AIT will use as a tensor. * `shape: List[int]`: The shape of the tensor. * `dtype: str`: The tensor's dtype; one of `"float32", "float16", "int32", "int64"`. 
Note that most ops only support float16 at this stage. -If using AITemplate with PyTorch, `AITData`s can be constructed with the `torch_to_ait_data` utility: +When using AITemplate with PyTorch, `AITData` can be constructed with the `torch_to_ait_data` utility: .. code-block:: python @@ -30,14 +30,14 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al `run` ----- -`run` takes a set of inputs and outputs as `AITData`s. Both arguments can be passed as either an ordered list or a dictionary (mapping name to tensor). +`run` takes inputs and outputs as collections of `AITData` instances. Both arguments can be passed as either an ordered list or a dictionary (mapping name to tensor). .. code-block:: python # Arguments as a dictionary module.run( {"input0": in0_ait, "input1": in1_ait}, - {"output0": out0_ait, "output1": out0_ait}, + {"output0": out0_ait, "output1": out1_ait}, ) # Arguments as an ordered list. Note that you might need to query @@ -45,8 +45,8 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al input_name_to_idx = module.get_input_name_to_index_map() output_name_to_idx = module.get_output_name_to_index_map() - inputs = [None for i in range(len(input_name_to_idx))] - outputs = [None for i in range(len(input_name_to_idx))] + inputs = [None] * len(input_name_to_idx) + outputs = [None] * len(output_name_to_idx) for name in input_name_to_idx: inputs[input_name_to_idx[name]] = ait_inputs[name] @@ -55,9 +55,9 @@ If PyTorch is not available, `Model` provides a set of functions for copying, al outputs[output_name_to_idx[name]] = ait_outputs[name] module.run(inputs, outputs) - -One important caveat is that the output must be its **maximum** size. This is because of dynamic shapes - the size of the output may vary, but its shape is not inferred until inference time. 
The maximum shape can be queried with the `get_output_maximum_shape()`: + +One important caveat is that the output must have the **maximum** possible size. This is because of dynamic shapes: the size of the output may vary, but its shape is not inferred until inference time. The maximum shape can be queried with the `get_output_maximum_shape()`: .. code-block:: python @@ -67,7 +67,7 @@ One important caveat is that the output must be its **maximum** size. This is be max_shape = module.get_output_maximum_shape("output") -`Model.run` returns a dictionary of output `AITData`s with (possibly dynamic) shapes that the runtime inferred. +`Model.run` returns a dictionary of output `AITData` instances with (possibly dynamic) shapes that inferred in the runtime. Nullptr Inputs/Outputs ---------------------- @@ -102,7 +102,7 @@ Constants are read-only and *shared* with all runtimes in the `ModelContainer`. `run_with_tensors` ------------------ -`run_with_tensors` is a convenience method with the same interface as `run`, except it can take lists of `torch.Tensor`s: +`run_with_tensors` is a convenience method with the same interface as `run`, except it can take lists (or dicts) of `torch.Tensor` instances: .. code-block:: python @@ -115,9 +115,14 @@ Constants are read-only and *shared* with all runtimes in the `ModelContainer`. Streams and Asynchronous Predictions ------------------------------------ -A pointer to a stream can optionally be passed to `run`. If none is given, the prediction happens on the default stream 0. If the `sync` argument is set to `True`, the stream is synchronized before `run()` returns. `sync` is `True` by default. +A pointer to a stream can optionally be passed to `run`. +If none is given, the prediction happens on the default stream 0. +If the `sync` argument is set to `True`, the stream is synchronized before `run()` returns. +`sync` is `True` by default. -Multiple predictions can happen at the same time (on the same or different streams). 
Under the hood, there is a fixed-size pool of runtime objects. When all the runtimes are used, `run()` blocks until one is available. +Multiple predictions can happen at the same time (on the same or different streams). +Under the hood, there is a fixed-size pool of runtime objects. +When all the runtimes are used, `run()` blocks until one becomes available. The size of this pool can be configured with the `num_runtimes` option in `Model`'s constructor. CUDA Graph diff --git a/docs/source/tutorial/how_to_add_op.rst b/docs/source/tutorial/how_to_add_op.rst index 160745336..988f5375e 100644 --- a/docs/source/tutorial/how_to_add_op.rst +++ b/docs/source/tutorial/how_to_add_op.rst @@ -1,17 +1,17 @@ How to add an operator to the AIT codegen -========================================= +========================================= This tutorial will demonstrate how to add a new operator to the AIT codegen. -Full source code can be founded at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py` +Full source code can be found at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py`. 0. Prerequisites ------------------ +---------------- -We need to import necessary Python modules +We need to import necessary Python modules: .. code-block:: python - + from typing import Any, Dict, List import jinja2 @@ -26,9 +26,9 @@ We need to import necessary Python modules 1. Define the operator graph node ----------------------------------- +--------------------------------- -Graph node is usually defined at `aitemplate/compiler/ops`. +Graph nodes are usually defined at `aitemplate/compiler/ops`. .. code-block:: python @@ -72,15 +72,15 @@ Graph node is usually defined at `aitemplate/compiler/ops`. .. note:: - `_attrs` in Operator is the most important data structure for codegen. - - `_attrs["op"]` is the identity of operator category, which is used to find the corresponding codegen function in backend, must be **unique**. 
+ - `_attrs["op"]` is the identity of operator category, which is used to find the corresponding codegen function in the backend; must be **unique**. 2. Define the necessary templates for Codegen ----------------------------------------------- +--------------------------------------------- In AIT, there are 4 important templates for codegen: - `FUNC_TEMPLATE`: the template for generating the function body of the operator, and invoke GPU kernel in the body. -- `FUNC_SIGNATURE_TEMPLATE`: the template for generating the function signature of the operator. The signature defined name, and arguments of the function. +- `FUNC_SIGNATURE_TEMPLATE`: the template for generating the function signature of the operator. The signature defines the name and arguments of the function. - `FUNC_CALL_TEMPLATE`: the template for generating the function call of the operator. The call will be used during inference to invoke the GPU kernel with given arguments. - `FUNC_DECL`: the template for forward declaration of the operator function. This is usually an alias of `FUNC_SIGNATURE_TEMPLATE`. @@ -128,7 +128,7 @@ In AIT, there are 4 important templates for codegen: ) 3. Create the GPU kernels --------------------------- +------------------------- In this example we use a simplest add one kernel. The kernel can be written by hand (as what programmer is expected to do), or generated by other tools. @@ -166,10 +166,10 @@ In this example we use a simplest add one kernel. The kernel can be written by h ) 4. Define the codegen function -------------------------------- +------------------------------ -The codegen function is the function that render the templates we defined into valid C++ code string. -The codegen function will take `func_attrs` from graph node, and fill into the jinja2 template. +The codegen function is the function that renders the templates we defined into valid C++ code string. +The codegen function will take `func_attrs` from the graph node, and fill in the jinja2 template. .. 
code-block:: python @@ -213,10 +213,10 @@ The codegen function will take `func_attrs` from graph node, and fill into the j ).strip() ) -5.1 Register the codegen function to CUDA backend ---------------------------------------------------- +5.1 Register the codegen function in CUDA backend +------------------------------------------------- -CUDA backend functions is usually defined at `aitemplate/backend/cuda/`. +CUDA backend functions are usually defined at `aitemplate/backend/cuda/`. .. code-block:: python @@ -240,10 +240,9 @@ CUDA backend functions is usually defined at `aitemplate/backend/cuda/`. return gen_function_call(func_attrs, indent, is_cuda=True) 5.2 (Optional) Register the codegen function to ROCm backend --------------------------------------------------------------- - -ROCm backend functions is usually defined at `aitemplate/backend/rocm/`. +------------------------------------------------------------ +ROCm backend functions are usually defined at `aitemplate/backend/rocm/`. .. code-block:: python @@ -269,7 +268,7 @@ ROCm backend functions is usually defined at `aitemplate/backend/rocm/`. 6. Compile and verify the results with PyTorch ------------------------------------------------- +---------------------------------------------- .. code-block:: python @@ -299,4 +298,3 @@ ROCm backend functions is usually defined at `aitemplate/backend/rocm/`. outputs = {"Y": y} module.run_with_tensors(inputs, outputs) print(torch.allclose(y, y_pt, atol=1e-2, rtol=1e-2)) - diff --git a/docs/source/tutorial/how_to_infer_pt.rst b/docs/source/tutorial/how_to_infer_pt.rst index 67891c46a..8b0535ce0 100644 --- a/docs/source/tutorial/how_to_infer_pt.rst +++ b/docs/source/tutorial/how_to_infer_pt.rst @@ -1,17 +1,15 @@ How to inference a PyTorch model with AIT -========================================== +========================================= This tutorial will demonstrate how to inference a PyTorch model with AIT. 
-Full source code can be founded at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py` +Full source code can be found at `examples/07_how_to_run_pt_model/how_to_run_pt_model.py`. 0. Prerequisites ------------------ +---------------- -We need to import necessary Python modules +We need to import necessary Python modules: .. code-block:: python - - from collections import OrderedDict import torch @@ -23,9 +21,9 @@ We need to import necessary Python modules 1. Define a PyTorch module ---------------------------- +-------------------------- -Here we define a PyTorch model which is commonly seen in Transformers. +Here we define a PyTorch model which is commonly seen in Transformers: .. code-block:: python @@ -46,7 +44,7 @@ Here we define a PyTorch model which is commonly seen in Transformers. return hidden_states 2. Define an AIT module ------------------------- +----------------------- We can define a similar AIT module as follows: @@ -69,22 +67,23 @@ We can define a similar AIT module as follows: .. warning:: The `nn.Module` API in AIT looks similar to PyTorch, but it is not the same. - The fundamental difference is that AIT module is a container to build graph, while PyTorch module is a container to store parameters for eager. - Which means, each AIT module's `forward` method can be only called once, and the graph is built during the first call. If you want to share parameters, needs to call `compiler.ops` instead. The `compiler.ops` is similar to `functional` in PyTorch. + The fundamental difference is that AIT module is a container to build a graph, while PyTorch module is a container to store parameters for eager. + Which means, each AIT module's `forward` method can be only called once, and the graph is built during the first call. + If you want to share parameters, you need to use the `compiler.ops` instead. The `compiler.ops` is similar to `functional` in PyTorch. + + AITemplate supports automatic fusion of linear followed by other operators. 
However in many cases, especially for quick iterations, we use manual `specialization` to specify the fused operator. For example, `specialization="fast_gelu"` will fuse linear with the `fast_gelu` operator. - AITemplate supports automatically fusion on linear followed by other operators. However in many case especially for quick iterations, we use manual `specialization` to specify the fused operator. For example, `specialization="fast_gelu"` will fuse linear with `fast_gelu` operator. - 3. Define a helper function to map PyTorch parameters to AIT parameters -------------------------------------------------------------------------- +----------------------------------------------------------------------- -In AIT, all names must follow C variable naming standard because the name will be used in codegen process. +In AIT, all names must follow the C variable naming standard, because the names will be used in the codegen process. .. code-block:: python def map_pt_params(ait_model, pt_model): ait_model.name_parameter_tensor() pt_params = dict(pt_model.named_parameters()) - mapped_pt_params = OrderedDict() + mapped_pt_params = {} for name, _ in ait_model.named_parameters(): ait_name = name.replace(".", "_") assert name in pt_params @@ -93,12 +92,12 @@ In AIT, all names must follow C variable naming standard because the name will b .. warning:: - - Different to PyTorch, it is required to call ait_model **.name_parameter_tensor()** method to provide each parameter a name with direct map to PyTorch. - - Because all names in AIT must follow C variable naming standard, you can easier replace `.` to `_` or use a regular expression to make sure the name in valid. - - For network with conv + bn subgraph, we currently haven't provide automatic pass to fold it. Refer our ResNet and Detectron2 examples to see how we handle CNN layout transform and BatchNorm folding. 
+ - Different from PyTorch, it is required to call ait_model **.name_parameter_tensor()** method to provide each parameter with a name with a direct map to PyTorch. + - Because all names in AIT must follow the C variable naming standard, you can easily replace `.` by `_` or use a regular expression to make sure the name is valid. + - For networks with conv + bn subgraph, we currently don't provide an automatic pass to fold it. Please refer to our ResNet and Detectron2 examples to see how we handle CNN layout transform and BatchNorm folding. 4. Create PyTorch module, inputs/outputs ------------------------------------------ +---------------------------------------- .. code-block:: python @@ -115,7 +115,7 @@ In AIT, all names must follow C variable naming standard because the name will b y_pt = pt_model(x) 5. Create AIT module, inputs/outputs -------------------------------------- +------------------------------------ .. code-block:: python @@ -139,12 +139,12 @@ In AIT, all names must follow C variable naming standard because the name will b .. warning:: - Similar to MetaTensor, LazyTensor and a lot of other lazy evaluation frameworks, AIT's Tensor records the computation graph, and the graph is built when the Tensor is compiled. - - For input tensor, it is required to set the attribute **is_input=True** - - For output tensor, it is required to set the attribute **Y._attrs["is_output"] = True** - - For input and output tensors, it is better to provide **name** attributes to use in runtime + - For input tensor, it is required to set the attribute **is_input=True**. + - For output tensor, it is required to set the attribute **Y._attrs["is_output"] = True**. + - For input and output tensors, it is better to provide the **name** attributes to use in runtime. -6. Compile AIT module in to runtime, and do verification --------------------------------------------------------- +6. 
Compile AIT module into runtime and do verification +------------------------------------------------------ .. code-block:: python @@ -180,9 +179,9 @@ In AIT, all names must follow C variable naming standard because the name will b print(f"PyTorch eager time: {pt_t} ms/iter") -In this example, AIT will automatically fuse GELU and elementwise add into TensorCore/MatrixCore gemm operation. On RTX-3080 for this example, AIT is about 1.15X fast than PyTorch Eager in this example. +In this example, AIT will automatically fuse GELU and elementwise addition into the TensorCore/MatrixCore gemm operation. On RTX-3080, AIT is about 1.15X faster than PyTorch Eager in this example. .. note:: - - In this example, we fold parameters (weights) into AIT runtime, which the final dynamic library will contains parameters. - - If during compile we don't provide parameters, for example the total parameters size is greater than 2GB, we can always call `set_constant` function in runtime. Check runtime API for details. \ No newline at end of file + - In this example, we fold the parameters (`weights`) into AIT runtime. The final dynamic library will contain them as parameters. + - If during the compile time we don't provide the parameters (for example, because the total parameters size is greater than 2GB), we can always call `set_constant` function in the runtime. Please check the runtime API for the details. diff --git a/docs/source/tutorial/how_to_visualize.rst b/docs/source/tutorial/how_to_visualize.rst index 5af7c89a5..1b6856699 100644 --- a/docs/source/tutorial/how_to_visualize.rst +++ b/docs/source/tutorial/how_to_visualize.rst @@ -1,5 +1,5 @@ How to visualize an AIT model -============================== +============================= Visualization is important for understanding the behavior of a model optimization. In AIT, we modify the codegen a little bit, from generating CUDA/HIP C++ code to HTML/Javascript code, then we can generate a visualization of the model. 
The following code will generate a visualization of our first example. 1. Define the AIT Model ------------------------- +----------------------- .. code-block:: python @@ -71,15 +71,15 @@ The following code will generate a visualization of our first example. graph = apply_optimizations(output_tensor) 3. Generate visualization --------------------------- +------------------------- .. code-block:: python # Plot the graph - plot_graph(graph, file_path="ait_model.html", network_name="ait_sample_net") + plot_graph(graph, file_path="ait_model.html") The visualization will be generated in the "ait_model.html" file. This file can be opened in Chrome without any web server. .. raw:: html - \ No newline at end of file + diff --git a/docs/static/ait_model.html b/docs/static/ait_model.html index 18c56089d..3f414b67b 100644 --- a/docs/static/ait_model.html +++ b/docs/static/ait_model.html @@ -4,7 +4,7 @@ - ait_sample_net + ait_model